In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib
In [2]:
data = pd.read_csv("E:/insurance.csv")
In [3]:
data.head(5)
Out[3]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
In [5]:
data.age.describe()
Out[5]:
count    1338.000000
mean       39.207025
std        14.049960
min        18.000000
25%        27.000000
50%        39.000000
75%        51.000000
max        64.000000
Name: age, dtype: float64
In [6]:
import pandas as pd
In [7]:
data.smoker.value_counts()
Out[7]:
no     1064
yes     274
Name: smoker, dtype: int64
In [8]:
px.histogram(data, x='smoker', color='sex', title='Smoker')
In [9]:
fig = px.histogram(data, 
                   x='age', 
                   marginal='box', 
                   nbins=47, 
                   title='Distribution of Age')
fig.update_layout(bargap=0.1)
fig.show()
In [10]:
fig = px.histogram(data, 
                   x='bmi', 
                   marginal='box', 
                   color_discrete_sequence=['blue'], 
                   title='Distribution of BMI (Body Mass Index)')
fig.update_layout(bargap=0.1)
fig.show()
In [11]:
fig = px.histogram(data, 
                   x ='charges', 
                   marginal='box', 
                   color='smoker', 
                   color_discrete_sequence=['green','blue'], 
                   title='Annual Medical Charges')
fig.update_layout(bargap=0.1)
fig.show()
In [12]:
fig = px.scatter(data, 
                 x='age', 
                 y='charges', 
                 color='smoker',  
                 hover_data=['sex'], 
                 title='Age vs. Charges')
fig.update_traces(marker_size=5) #to emphasize each point with a specifice maker
fig.show()
In [18]:
fig = px.scatter(data, 
                 x='bmi', 
                 y='charges', 
                 color='smoker',  
                 hover_data=['sex'], 
                 title='BMI vs. Charges')
fig.update_traces(marker_size=5)
fig.show()
In [19]:
data.charges.corr(data.age)
Out[19]:
0.2990081933306474
In [20]:
data.charges.corr(data.bmi)
Out[20]:
0.19834096883362903
In [21]:
data.children.corr(data.charges)
Out[21]:
0.0679982268479047
In [22]:
data.corr()
C:\Users\Rajalaxmi Mohapatra\AppData\Local\Temp\ipykernel_10460\2627137660.py:1: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

Out[22]:
age bmi children charges
age 1.000000 0.109272 0.042469 0.299008
bmi 0.109272 1.000000 0.012759 0.198341
children 0.042469 0.012759 1.000000 0.067998
charges 0.299008 0.198341 0.067998 1.000000
In [23]:
sns.heatmap(data.corr(),cmap = 'Blues',annot = True)
plt.title = ("Co-reletation_matrix")
C:\Users\Rajalaxmi Mohapatra\AppData\Local\Temp\ipykernel_10460\3440050043.py:1: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

In [24]:
def estimate_charges(age, w, b):
    return w * age + b
In [25]:
##Let define a helper function estimate_charges, to compute  charges , given  age,  w and  b
##In the above case, the x axis shows "age" and the y axis shows "charges". Thus, we're assume the following relationship between the two:
##charges = w×age+b  

w = 50
b = 100
In [26]:
estimate_charges(30,w,b)
Out[26]:
1600
In [27]:
non_smoker_df = data[data.smoker == 'no']
In [31]:
ages = non_smoker_df.age
estimated_charges = estimate_charges(ages, w, b)
estimated_charges
Out[31]:
1       1000
2       1500
3       1750
4       1700
5       1650
        ... 
1332    2700
1333    2600
1334    1000
1335    1000
1336    1150
Name: age, Length: 1064, dtype: int64
In [32]:
non_smoker_df.charges
Out[32]:
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
5        3756.62160
           ...     
1332    11411.68500
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
Name: charges, Length: 1064, dtype: float64
In [33]:
plt.scatter(ages, estimated_charges);
plt.plot(ages,estimated_charges)
plt.xlabel('Age');
plt.ylabel('Estimated Charges');
In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
In [35]:
model = LinearRegression()
In [36]:
inputs = non_smoker_df[['age']]
targets = non_smoker_df.charges
print('inputs.shape :', inputs.shape)
print('targes.shape :', targets.shape)
inputs.shape : (1064, 1)
targes.shape : (1064,)
In [37]:
model.fit(inputs, targets)
Out[37]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [38]:
predictions = model.predict(inputs)
In [39]:
predictions
Out[39]:
array([2719.0598744 , 5391.54900271, 6727.79356686, ..., 2719.0598744 ,
       2719.0598744 , 3520.80661289])
In [48]:
import math 
from sklearn.metrics import mean_squared_error
In [54]:
def rmse(targets, predictions):
    return np.sqrt(np.mean(np.square(targets - predictions)))
In [55]:
rmse(targets, predictions)
Out[55]:
4662.505766636395
In [56]:
model.coef_
Out[56]:
array([267.24891283])
In [57]:
model.intercept_
Out[57]:
-2091.4205565650827
In [58]:
model1 = SGDRegressor()
In [59]:
model1.fit(inputs,targets)
Out[59]:
SGDRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SGDRegressor()
In [60]:
predictions = model1.predict(inputs)
rmse(targets, predictions)
Out[60]:
5075.193593045272
In [62]:
#Model creation for smoker dataset
smoker_df = data[data.smoker == 'yes']
smoker_df
Out[62]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
11 62 female 26.290 0 yes southeast 27808.72510
14 27 male 42.130 0 yes southeast 39611.75770
19 30 male 35.300 0 yes southwest 36837.46700
23 34 female 31.920 1 yes northeast 37701.87680
... ... ... ... ... ... ... ...
1313 19 female 34.700 2 yes southwest 36397.57600
1314 30 female 23.655 3 yes northwest 18765.87545
1321 62 male 26.695 0 yes northeast 28101.33305
1323 42 female 40.370 2 yes southeast 43896.37630
1337 61 female 29.070 0 yes northwest 29141.36030

274 rows × 7 columns

In [63]:
inputs = smoker_df[['age']]
targets = smoker_df.charges
print('',inputs.shape)
print('',targets.shape)
 (274, 1)
 (274,)
In [64]:
model.fit(inputs,targets)
Out[64]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [65]:
predicted = model.predict(inputs)
print('RMSE:',)
rmse(targets,predicted)
RMSE:
Out[65]:
10711.00334810241
In [66]:
sns.barplot(data = data, x='smoker', y='charges');
In [67]:
smoker_codes = {'no': 0, 'yes': 1}
data['smoker_code'] = data.smoker.map(smoker_codes)
In [68]:
data.charges.corr(data.smoker_code)
Out[68]:
0.7872514304984761
In [69]:
data
Out[69]:
age sex bmi children smoker region charges smoker_code
0 19 female 27.900 0 yes southwest 16884.92400 1
1 18 male 33.770 1 no southeast 1725.55230 0
2 28 male 33.000 3 no southeast 4449.46200 0
3 33 male 22.705 0 no northwest 21984.47061 0
4 32 male 28.880 0 no northwest 3866.85520 0
... ... ... ... ... ... ... ... ...
1333 50 male 30.970 3 no northwest 10600.54830 0
1334 18 female 31.920 0 no northeast 2205.98080 0
1335 18 female 36.850 0 no southeast 1629.83350 0
1336 21 female 25.800 0 no southwest 2007.94500 0
1337 61 female 29.070 0 yes northwest 29141.36030 1

1338 rows × 8 columns

In [71]:
# Create inputs and targets
inputs, targets = data[['age', 'bmi', 'children', 'smoker_code']], data['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6056.439217188081
In [73]:
sns.barplot(data=data, x='sex', y='charges')
Out[73]:
<Axes: xlabel='sex', ylabel='charges'>
In [74]:
sex_codes = {'female': 0, 'male': 1}
In [75]:
data['sex_code'] = data.sex.map(sex_codes)
In [76]:
data
Out[76]:
age sex bmi children smoker region charges smoker_code sex_code
0 19 female 27.900 0 yes southwest 16884.92400 1 0
1 18 male 33.770 1 no southeast 1725.55230 0 1
2 28 male 33.000 3 no southeast 4449.46200 0 1
3 33 male 22.705 0 no northwest 21984.47061 0 1
4 32 male 28.880 0 no northwest 3866.85520 0 1
... ... ... ... ... ... ... ... ... ...
1333 50 male 30.970 3 no northwest 10600.54830 0 1
1334 18 female 31.920 0 no northeast 2205.98080 0 0
1335 18 female 36.850 0 no southeast 1629.83350 0 0
1336 21 female 25.800 0 no southwest 2007.94500 0 0
1337 61 female 29.070 0 yes northwest 29141.36030 1 0

1338 rows × 9 columns

In [79]:
data.charges.corr(data.sex_code)
Out[79]:
0.057292062202025464
In [80]:
# Create inputs and targets
inputs, targets = data[['age', 'bmi', 'children', 'smoker_code', 'sex_code']], data['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6056.100708754546
In [81]:
sns.barplot(data = data, x='region', y='charges');
In [83]:
#Model Improvements
#Feature Scaling
#Recall that due to regulatory requirements, we also need to explain the rationale behind the predictions our model.

#charges=w1×age+w2×bmi+w3×children+w4×smoker+w5×sex+w6×region+b
 
#To compare the importance of each feature in the model, our first instinct might be to compare their weights.
In [84]:
model.coef_
Out[84]:
array([  257.73498767,   322.36421449,   474.41112061, 23823.39253065,
        -128.63985357])
In [85]:
model.intercept_
Out[85]:
-12052.461985664726
In [89]:
data
Out[89]:
age sex bmi children smoker region charges smoker_code sex_code
0 19 female 27.900 0 yes southwest 16884.92400 1 0
1 18 male 33.770 1 no southeast 1725.55230 0 1
2 28 male 33.000 3 no southeast 4449.46200 0 1
3 33 male 22.705 0 no northwest 21984.47061 0 1
4 32 male 28.880 0 no northwest 3866.85520 0 1
... ... ... ... ... ... ... ... ... ...
1333 50 male 30.970 3 no northwest 10600.54830 0 1
1334 18 female 31.920 0 no northeast 2205.98080 0 0
1335 18 female 36.850 0 no southeast 1629.83350 0 0
1336 21 female 25.800 0 no southwest 2007.94500 0 0
1337 61 female 29.070 0 yes northwest 29141.36030 1 0

1338 rows × 9 columns

In [90]:
from sklearn.preprocessing import StandardScaler
In [91]:
numeric_cols = ['age', 'bmi', 'children'] 
scaler = StandardScaler()
scaler.fit(data[numeric_cols])
Out[91]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [92]:
scaler.mean_
Out[92]:
array([39.20702541, 30.66339686,  1.09491779])
In [93]:
scaler.var_
Out[93]:
array([197.25385199,  37.16008997,   1.45212664])
In [95]:
data[numeric_cols]
Out[95]:
age bmi children
0 19 27.900 0
1 18 33.770 1
2 28 33.000 3
3 33 22.705 0
4 32 28.880 0
... ... ... ...
1333 50 30.970 3
1334 18 31.920 0
1335 18 36.850 0
1336 21 25.800 0
1337 61 29.070 0

1338 rows × 3 columns

In [96]:
scaled_inputs = scaler.transform(data[numeric_cols])
scaled_inputs
Out[96]:
array([[-1.43876426, -0.45332   , -0.90861367],
       [-1.50996545,  0.5096211 , -0.07876719],
       [-0.79795355,  0.38330685,  1.58092576],
       ...,
       [-1.50996545,  1.0148781 , -0.90861367],
       [-1.29636188, -0.79781341, -0.90861367],
       [ 1.55168573, -0.26138796, -0.90861367]])
In [97]:
from sklearn.model_selection import train_test_split
In [98]:
inputs_train, inputs_test, targets_train, targets_test = train_test_split(inputs, targets, test_size=0.2,random_state=0)
In [99]:
# Create and train the model
model = LinearRegression().fit(inputs_train, targets_train)

# Generate predictions
predictions_test = model.predict(inputs_test)

# Compute loss to evalute the model
loss = rmse(targets_test, predictions_test)
print('Test Loss:', loss)
Test Loss: 5671.492452926755
In [100]:
# Generate predictions
predictions_train = model.predict(inputs_train)

# Compute loss to evalute the model
loss = rmse(targets_train, predictions_train)
print('Training Loss:', loss)
Training Loss: 6150.508349257895
In [ ]: